home *** CD-ROM | disk | FTP | other *** search
/ MacHack 2000 / MacHack 2000.toast / pc / The Hacks / MacHacksBug / Python 1.5.2c1 / Demo / tkinter / www / sgmllib.pyc (.txt) < prev    next >
Encoding:
Python Compiled Bytecode  |  2000-06-23  |  12.1 KB  |  355 lines

  1. # Source Generated with Decompyle++
  2. # File: in.pyc (Python 1.5)
  3.  
  4. import regex
  5. import string
# Patterns used by SGMLParser.goahead().  They are written for the old
# `regex` module, where `\|` separates alternatives and `\(...\)` groups.

# Anything that might be the start of markup or of a character/entity
# reference.  goahead() searches for it to find where plain data ends, and
# matches it again to measure a construct that none of the complete
# patterns below accepted.
incomplete = regex.compile('<!-?\\|</[a-zA-Z][a-zA-Z0-9]*[ \t\n]*\\|</?\\|' + '&#[a-zA-Z0-9]*\\|&[a-zA-Z][a-zA-Z0-9]*\\|&')
# A complete entity reference: '&', a name, then ';' or '.'.
entityref = regex.compile('&[a-zA-Z][a-zA-Z0-9]*[;.]')
# A complete character reference: '&#', one or more alphanumerics, ';'.
charref = regex.compile('&#[a-zA-Z0-9]+;')
# The first two characters of a start tag.
starttagopen = regex.compile('<[a-zA-Z]')
# A complete end tag, allowing whitespace before the closing '>'.
endtag = regex.compile('</[a-zA-Z][a-zA-Z0-9]*[ \t\n]*>')
# The opening delimiter of a comment.
commentopen = regex.compile('<!--')
  12.  
  13. class SGMLParser:
  14.     
  15.     def __init__(self):
  16.         self.reset()
  17.  
  18.     
  19.     def reset(self):
  20.         self.rawdata = ''
  21.         self.stack = []
  22.         self.nomoretags = 0
  23.         self.literal = 0
  24.  
  25.     
  26.     def setnomoretags(self):
  27.         self.nomoretags = self.literal = 1
  28.  
  29.     
  30.     def setliteral(self, *args):
  31.         self.literal = 1
  32.  
  33.     
  34.     def feed(self, data):
  35.         self.rawdata = self.rawdata + data
  36.         self.goahead(0)
  37.  
  38.     
  39.     def close(self):
  40.         self.goahead(1)
  41.  
  42.     
  43.     def goahead(self, end):
  44.         rawdata = self.rawdata
  45.         i = 0
  46.         n = len(rawdata)
  47.         while i < n:
  48.             if self.nomoretags:
  49.                 self.handle_data(rawdata[i:n])
  50.                 i = n
  51.                 break
  52.             
  53.             j = incomplete.search(rawdata, i)
  54.             if j < 0:
  55.                 j = n
  56.             
  57.             if i < j:
  58.                 self.handle_data(rawdata[i:j])
  59.             
  60.             i = j
  61.             if i == n:
  62.                 break
  63.             
  64.             if rawdata[i] == '<':
  65.                 if starttagopen.match(rawdata, i) >= 0:
  66.                     if self.literal:
  67.                         self.handle_data(rawdata[i])
  68.                         i = i + 1
  69.                         continue
  70.                     
  71.                     k = self.parse_starttag(i)
  72.                     if k < 0:
  73.                         break
  74.                     
  75.                     i = i + k
  76.                     continue
  77.                 
  78.                 k = endtag.match(rawdata, i)
  79.                 if k >= 0:
  80.                     j = i + k
  81.                     self.parse_endtag(rawdata[i:j])
  82.                     i = j
  83.                     self.literal = 0
  84.                     continue
  85.                 
  86.                 if commentopen.match(rawdata, i) >= 0:
  87.                     if self.literal:
  88.                         self.handle_data(rawdata[i])
  89.                         i = i + 1
  90.                         continue
  91.                     
  92.                     k = self.parse_comment(i)
  93.                     if k < 0:
  94.                         break
  95.                     
  96.                     i = i + k
  97.                     continue
  98.                 
  99.             elif rawdata[i] == '&':
  100.                 k = charref.match(rawdata, i)
  101.                 if k >= 0:
  102.                     j = i + k
  103.                     self.handle_charref(rawdata[i + 2:j - 1])
  104.                     i = j
  105.                     continue
  106.                 
  107.                 k = entityref.match(rawdata, i)
  108.                 if k >= 0:
  109.                     j = i + k
  110.                     self.handle_entityref(rawdata[i + 1:j - 1])
  111.                     i = j
  112.                     continue
  113.                 
  114.             else:
  115.                 raise RuntimeError, 'neither < nor & ??'
  116.             k = incomplete.match(rawdata, i)
  117.             if k < 0:
  118.                 raise RuntimeError, 'no incomplete match ??'
  119.             
  120.             j = i + k
  121.             if j == n:
  122.                 break
  123.             
  124.             self.handle_data(rawdata[i:j])
  125.             i = j
  126.         if end and i < n:
  127.             self.handle_data(rawdata[i:n])
  128.             i = n
  129.         
  130.         self.rawdata = rawdata[i:]
  131.  
  132.     
  133.     def parse_comment(self, i):
  134.         rawdata = self.rawdata
  135.         if rawdata[i:i + 4] != '<!--':
  136.             raise RuntimeError, 'unexpected call to handle_comment'
  137.         
  138.         
  139.         try:
  140.             j = string.index(rawdata, '--', i + 4)
  141.         except string.index_error:
  142.             return -1
  143.  
  144.         self.handle_comment(rawdata[i + 4:j])
  145.         j = j + 2
  146.         n = len(rawdata)
  147.         while j < n and rawdata[j] in ' \t\n':
  148.             j = j + 1
  149.         if j == n:
  150.             return -1
  151.         
  152.         if rawdata[j] == '>':
  153.             j = j + 1
  154.         else:
  155.             print '*** comment not terminated with >'
  156.             print repr(rawdata[j - 5:j]), '*!*', repr(rawdata[j:j + 5])
  157.         return j - i
  158.  
  159.     
  160.     def parse_starttag(self, i):
  161.         rawdata = self.rawdata
  162.         
  163.         try:
  164.             j = string.index(rawdata, '>', i)
  165.         except string.index_error:
  166.             return -1
  167.  
  168.         attrs = []
  169.         tagfind = regex.compile('[a-zA-Z][a-zA-Z0-9]*')
  170.         attrfind = regex.compile('[ \t\n]+\\([a-zA-Z][a-zA-Z0-9]*\\)' + '\\([ \t\n]*=[ \t\n]*' + '\\(\'[^\']*\';\\|"[^"]*"\\|[-a-zA-Z0-9./:+*%?!()_#]+\\)\\)?')
  171.         k = tagfind.match(rawdata, i + 1)
  172.         if k < 0:
  173.             raise RuntimeError, 'unexpected call to parse_starttag'
  174.         
  175.         k = i + 1 + k
  176.         tag = string.lower(rawdata[i + 1:k])
  177.         while k < j:
  178.             l = attrfind.match(rawdata, k)
  179.             if l < 0:
  180.                 break
  181.             
  182.             regs = attrfind.regs
  183.             (a1, b1) = regs[1]
  184.             (a2, b2) = regs[2]
  185.             (a3, b3) = regs[3]
  186.             attrname = rawdata[a1:b1]
  187.             if '=' in rawdata[k:k + l]:
  188.                 attrvalue = rawdata[a3:b3]
  189.                 if "'" == "'":
  190.                     pass
  191.                 elif not "'" == attrvalue[-1:]:
  192.                     if '"' == '"':
  193.                         pass
  194.                     elif '"' == attrvalue[-1:]:
  195.                         attrvalue = attrvalue[1:-1]
  196.                     
  197.                 else:
  198.                     attrvalue = ''
  199.             attrs.append(string.lower(attrname), attrvalue)
  200.             k = k + l
  201.         j = j + 1
  202.         
  203.         try:
  204.             method = getattr(self, 'start_' + tag)
  205.         except AttributeError:
  206.             
  207.             try:
  208.                 method = getattr(self, 'do_' + tag)
  209.             except AttributeError:
  210.                 self.unknown_starttag(tag, attrs)
  211.                 return j - i
  212.  
  213.             method(attrs)
  214.             return j - i
  215.  
  216.         self.stack.append(tag)
  217.         method(attrs)
  218.         return j - i
  219.  
  220.     
  221.     def parse_endtag(self, data):
  222.         if data[:2] != '</' or data[-1:] != '>':
  223.             raise RuntimeError, 'unexpected call to parse_endtag'
  224.         
  225.         tag = string.lower(string.strip(data[2:-1]))
  226.         
  227.         try:
  228.             method = getattr(self, 'end_' + tag)
  229.         except AttributeError:
  230.             self.unknown_endtag(tag)
  231.             return None
  232.  
  233.         if self.stack and self.stack[-1] == tag:
  234.             del self.stack[-1]
  235.         else:
  236.             print '*** Unbalanced </' + tag + '>'
  237.             print '*** Stack:', self.stack
  238.             found = None
  239.             for i in range(len(self.stack)):
  240.                 pass
  241.             
  242.             if found != None:
  243.                 del self.stack[found:]
  244.             
  245.         method()
  246.  
  247.     
  248.     def handle_charref(self, name):
  249.         
  250.         try:
  251.             n = string.atoi(name)
  252.         except string.atoi_error:
  253.             self.unknown_charref(name)
  254.             return None
  255.  
  256.         if not None if n <= n else n <= 255:
  257.             self.unknown_charref(name)
  258.             return None
  259.         
  260.         self.handle_data(chr(n))
  261.  
  262.     entitydefs = {
  263.         'lt': '<',
  264.         'gt': '>',
  265.         'amp': '&',
  266.         'quot': '"',
  267.         'apos': "'" }
  268.     
  269.     def handle_entityref(self, name):
  270.         table = self.__class__.entitydefs
  271.         name = string.lower(name)
  272.         if table.has_key(name):
  273.             self.handle_data(table[name])
  274.         else:
  275.             self.unknown_entityref(name)
  276.             return None
  277.  
  278.     
  279.     def handle_data(self, data):
  280.         pass
  281.  
  282.     
  283.     def handle_comment(self, data):
  284.         pass
  285.  
  286.     
  287.     def unknown_starttag(self, tag, attrs):
  288.         pass
  289.  
  290.     
  291.     def unknown_endtag(self, tag):
  292.         pass
  293.  
  294.     
  295.     def unknown_charref(self, ref):
  296.         pass
  297.  
  298.     
  299.     def unknown_entityref(self, ref):
  300.         pass
  301.  
  302.  
  303.  
  304. class TestSGML(SGMLParser):
  305.     
  306.     def handle_data(self, data):
  307.         r = repr(data)
  308.         if len(r) > 72:
  309.             r = r[:35] + '...' + r[-35:]
  310.         
  311.         print 'data:', r
  312.  
  313.     
  314.     def handle_comment(self, data):
  315.         r = repr(data)
  316.         if len(r) > 68:
  317.             r = r[:32] + '...' + r[-32:]
  318.         
  319.         print 'comment:', r
  320.  
  321.     
  322.     def unknown_starttag(self, tag, attrs):
  323.         print 'start tag: <' + tag,
  324.         for name, value in attrs:
  325.             print name + '=' + '"' + value + '"',
  326.         
  327.         print '>'
  328.  
  329.     
  330.     def unknown_endtag(self, tag):
  331.         print 'end tag: </' + tag + '>'
  332.  
  333.     
  334.     def unknown_entityref(self, ref):
  335.         print '*** unknown entity ref: &' + ref + ';'
  336.  
  337.     
  338.     def unknown_charref(self, ref):
  339.         print '*** unknown char ref: &#' + ref + ';'
  340.  
  341.  
  342.  
  343. def test():
  344.     file = 'test.html'
  345.     f = open(file, 'r')
  346.     x = TestSGML()
  347.     while 1:
  348.         line = f.readline()
  349.         if not line:
  350.             x.close()
  351.             break
  352.         
  353.         x.feed(line)
  354.  
  355.